This notebook trains a random forest on the benchmark training data (which itself combines randomly chosen locations with false positives from a previous, sub-optimal classifier). The resulting, much-improved classifier is then used to build a bootstrapped sample of 20,000 negative examples, approximately uniformly distributed in difficulty.
Before running all the cells, note that several of them involve time-consuming calculations, both locally and on PiCloud.
In [1]:
%pylab
import json
import random
import itertools
import os
import sys
import numpy as np
from sklearn.metrics import auc_score
from sklearn.utils import resample
# Set WISERF_ROOT before importing WiseRF, so the library can locate its install
os.environ['WISERF_ROOT'] = '/Users/beaumont/WiseRF-1.5.9-macosx-x86_64-rc1'
from bubbly.wiserf import WiseRF
from bubbly.extractors import MultiViewExtractor, ManyManyExtractors
from bubbly.util import roc_curve
from bubbly.dr1 import WideLocationGenerator, highest_quality_on_params
from bubbly.hyperopt import fmin, rf_space, auc_below_fpos
In [2]:
data = json.load(open('../models/benchmark_training_data.json'))
lg = WideLocationGenerator(mod3=1)
data['pos'] = filter(lambda x: lg.valid_longitude(x[0]), highest_quality_on_params())
for k, v in data.items():
data[k] = sorted(v)
ex = MultiViewExtractor(ManyManyExtractors())
ex.shp = (60, 60)
In [3]:
def _xy(ex, on, off):
    # One flattened feature vector per example; positives stacked before negatives
    x = np.vstack([ex.extract(*o).ravel().astype(np.float32) for o in on + off])
    x = np.nan_to_num(x)
    y = np.hstack((np.ones(len(on), dtype=np.int), np.zeros(len(off), dtype=np.int)))
    return x, y
npos = len(data['pos'])
xtrain, ytrain = _xy(ex, data['pos'], random.sample(data['neg'], npos))
Warning: building the validation set takes a while (~1 hour).
In [ ]:
xvalidate, yvalidate = _xy(ex, data['cv_pos'], data['cv_neg'])
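Since this is the slow step, a simple cache makes re-runs cheap. A minimal sketch, assuming a scratch file named validation_cache.npz (the name is arbitrary, not part of this notebook's outputs):
# Hypothetical cache: load the validation arrays if present, else build and save them
if os.path.exists('validation_cache.npz'):
    _cache = np.load('validation_cache.npz')
    xvalidate, yvalidate = _cache['xvalidate'], _cache['yvalidate']
else:
    xvalidate, yvalidate = _xy(ex, data['cv_pos'], data['cv_neg'])
    np.savez_compressed('validation_cache.npz',
                        xvalidate=xvalidate, yvalidate=yvalidate)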
In [4]:
def rf_objective(**params):
    # Fit on the training set; score by partial AUC at false-positive
    # rates below 5e-4, negated so the hyperopt loop can minimize it
    clf = WiseRF(**params)
    clf.fit(xtrain, ytrain)
    df = clf.decision_function(xvalidate).ravel()
    return -auc_below_fpos(yvalidate, df, .0005), clf
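auc_below_fpos comes from bubbly.hyperopt and is not shown here. The following is only a rough sketch of the idea (area under the ROC curve restricted to very low false-positive rates), not bubbly's actual implementation:
from sklearn.metrics import roc_curve as sk_roc_curve  # aliased: bubbly.util.roc_curve is already imported

def partial_auc(y_true, scores, max_fpos):
    # Integrate the ROC curve only where the false-positive rate stays
    # below max_fpos, rewarding accuracy in the low-contamination regime
    fpr, tpr, _ = sk_roc_curve(y_true, scores)
    keep = fpr <= max_fpos
    return np.trapz(tpr[keep], fpr[keep])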
In [7]:
#this will loop until interrupted
for best, best_params, clf in fmin(rf_objective, rf_space):
print best, best_params
sys.stdout.flush()
roc_curve(yvalidate, clf.decision_function(xvalidate), label='val', lw=3)
roc_curve(ytrain, clf.decision_function(xtrain), label='train', lw=3)
xlim(0, .002)
legend(loc='lower right')
show()
In [8]:
best_params = {'max_features': 'auto', 'min_samples_split': 4, 'n_jobs': 2, 'criterion': 'infogain', 'n_estimators': 800}
clf = WiseRF(**best_params).fit(xtrain, ytrain)
In [9]:
df = clf.decision_function(xvalidate).ravel()
In [12]:
from skimage.util.montage import montage2d
def montage(arrs):
print "montaging %i images" % len(arrs)
print 'image dim:', arrs[0].shape
r, g, b = tuple(montage2d(np.array([a[:, :, i] for a in arrs]))
for i in range(3))
return np.dstack((r, g, b)).astype(np.uint8)
In [14]:
from bubbly.extractors import RGBExtractor
def _ex(*params):
    # Render each location at two zoom levels, side by side
    p = list(params)
    r1 = rgb.extract(*p)
    p[-1] *= 2.5  # widen the cutout for the second view
    r2 = rgb.extract(*p)
    return np.hstack((r1, r2))
rgb = RGBExtractor()
rgb.shp = (100, 100)
In [11]:
# Lowest-scoring positives and highest-scoring negatives are the hardest examples
on_ind = np.argsort(df[yvalidate == 1])
off_ind = np.argsort(df[yvalidate == 0])[::-1]
figure(figsize=(15, 10))
im = montage([_ex(*data['cv_pos'][i]) for i in on_ind[:25]])
imshow(im, origin='upper')
title("Hard Positives")
show()
figure(figsize=(15, 10))
im = montage([_ex(*data['cv_neg'][i]) for i in off_ind[:16]])
imshow(im, origin='upper')
title("Hard Negatives")
show()
In [16]:
import cPickle
with open('good_classifier.pkl', 'wb') as outfile:
    cPickle.dump(clf, outfile)
In [18]:
np.savez_compressed('training_data.npz', xtrain=xtrain, ytrain=ytrain, xvalidate=xvalidate, yvalidate=yvalidate)
We'll use the classifier we just trained to score 200,000 nominally negative examples, from which we can build the final classifiers.
In [2]:
import cPickle as pickle
clf = pickle.load(open('good_classifier.pkl', 'rb'))
In [3]:
def getoff(loc):
    # 100,000 negative locations from each generator, 200,000 in total
    return list(itertools.islice(loc.negatives_iterator(), 0, 100000))
loc1 = WideLocationGenerator(0)
loc2 = WideLocationGenerator(1)
offs = getoff(loc1) + getoff(loc2)
In [10]:
from bubbly.model import Model
ex = MultiViewExtractor(ManyManyExtractors())
ex.shp = (60, 60)
lg = WideLocationGenerator(mod3=1)
offs = sorted(offs)  # sorting by location minimizes IO during extraction
m = Model(ex, lg, clf)
offs_df = m.cloud_decision_function(offs, workers=100)
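cloud_decision_function farms the feature extraction out to PiCloud workers. For spot-checking a handful of locations without a PiCloud account, a hypothetical local equivalent (slow, and written here only as a sketch) would be:
def local_decision_function(ex, clf, params):
    # Same feature extraction as _xy above, applied to a short list of locations
    x = np.vstack([np.nan_to_num(ex.extract(*p).ravel().astype(np.float32))
                   for p in params])
    return clf.decision_function(x).ravel()

# e.g., local_decision_function(ex, clf, offs[:5])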
It looks like we need to ignore the ~50 examples with the highest decision function values, since these are mostly real bubbles that are not in DR1.
The two mosaics below show the 98 regions with the highest decision functions.
In [15]:
ind = np.argsort(offs_df)[::-1]
figure(figsize=(15, 10))
imshow(montage([_ex(*offs[i]) for i in ind[:49]]), origin='upper')
show()
figure(figsize=(15, 10))
imshow(montage([_ex(*offs[i]) for i in ind[49:49*2]]), origin='upper')
Out[15]:
The proposed cutoff, placed at the 100th-highest decision function value:
In [25]:
cutoff = offs_df[ind[100]]  # score of the 100th-ranked example
hist(offs_df, histtype='step', bins=50)
axvline(cutoff, color='k')
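The notebook saves all scores below and leaves the filtering to downstream code, but applying the cutoff is a one-liner; clean_offs is a hypothetical name, not used elsewhere:
# Keep only negatives scoring below the cutoff; the ~100 above it are
# likely real bubbles that simply are not in the DR1 catalog
clean_offs = [o for o, s in zip(offs, offs_df) if s < cutoff]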
Out[25]:
In [46]:
# offs was sorted before scoring, so offs_df is already aligned with sorted(offs);
# re-sorting by a single coordinate could misalign scores and parameters
bootstrap_offs = {'off_params': offs,
                  'off_score': offs_df.tolist(),
                  'on_params': sorted(highest_quality_on_params())}
with open('../models/bootstrapped_labels.json', 'w') as outfile:
json.dump(bootstrap_offs, outfile)
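A downstream consumer can then draw the 20,000-example bootstrap described at the top from these saved scores. A minimal sketch of one way to sample negatives roughly uniformly in difficulty (the bin count and per-bin sample size are illustrative, not taken from this notebook):
labels = json.load(open('../models/bootstrapped_labels.json'))
scores = np.array(labels['off_score'])
params = labels['off_params']

# Equal-width score bins, equal draws per bin => sample is ~uniform in difficulty
nbins, per_bin = 20, 1000  # 20 * 1000 = 20,000 examples
edges = np.linspace(scores.min(), scores.max(), nbins + 1)
sample = []
for lo, hi in zip(edges[:-1], edges[1:]):
    idx = np.where((scores >= lo) & (scores < hi))[0]
    take = np.random.choice(idx, min(per_bin, len(idx)), replace=False)
    sample.extend(params[i] for i in take)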